from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Goal: scrape the chapter titles and chapter text of "Dream of the Red
# Chamber" (Hong Lou Meng) and save them into a single UTF-8 text file.
# The index page below lists a link to every chapter.
url="http://guoxue.lishichunqiu.com/gdxs/hongloumeng/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"}
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_html(session, page_url):
    """Download *page_url* with *session* and return the decoded HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = session.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # requests falls back to ISO-8859-1 when the server sends a text/*
    # content type without a charset, which garbles Chinese text. Only in
    # that case re-detect the encoding from the response body itself.
    if (response.encoding or "").lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding
    return response.text


# A session reuses the underlying connection across the many chapter requests.
with requests.Session() as session:
    # Fetch and parse the index page; a failure here is fatal and propagates.
    page_text = _fetch_html(session, url)
    soup = BeautifulSoup(page_text, "lxml")
    # Each matching anchor carries a chapter title and the chapter page link.
    a_list = soup.select(".line_bottom a")

    # The context manager closes the file even if a chapter raises midway.
    with open("./hongloumeng.txt", "w", encoding="utf-8") as fp:
        for a in a_list:
            href = a.get("href")
            if not href:
                # An anchor without a target cannot be followed; skip it.
                continue
            # Fall back to the visible link text when no title attribute exists.
            title = a.get("title") or a.get_text(strip=True)
            # Resolve relative links against the index URL; absolute links
            # are returned unchanged.
            content_url = urljoin(url, href)

            # Request the chapter page; one bad chapter should not abort the
            # whole run, so report it and move on.
            try:
                content = _fetch_html(session, content_url)
            except requests.RequestException as exc:
                print(title, "爬取失败：", exc)
                continue

            # Extract the chapter body from the detail page.
            detail_soup = BeautifulSoup(content, "lxml")
            content_node = detail_soup.find(id="content")
            if content_node is None:
                print(title, "爬取失败：未找到章节内容")
                continue
            content_text = content_node.text

            # Append the chapter to the output file.
            fp.write(title+":"+content_text+"\n")
            print(title,"爬取成功！")





